Classification Experiments

Here we run some experiments with feature spaces and classifier evaluation: first visualizing books and authors in the frequency and entropy feature spaces, then looking at how classification accuracy varies with the number of authors and the training proportion.


In [1]:
%pylab inline
# pylab mode supplies the bare scatter, boxplot, plot, xticks, hold, log and
# sort used below, and makes plt available as an alias for matplotlib.pyplot
import book_classification as bc
import pandas
import shelve
from sklearn import svm, decomposition, manifold, pipeline, cross_validation, metrics
import random
random.seed(1234)

In [2]:
myShelf = shelve.open("storage_new.db")
aBookCollection = myShelf['aBookCollection']
aBookCollection = aBookCollection.selection().remove_duplicates()
aDataFrame = aBookCollection.as_dataframe()
del myShelf

In [3]:
tokenizer = bc.BasicTokenizer()
frequency_extractor = bc.FrequenciesExtractor(tokenizer)
frequency_extractor = bc.PersistentExtractorWrapper(frequency_extractor, "frequencies")

entropy_extractor = bc.EntropiesExtractor(tokenizer, bc.FixedGrouper(500))
entropy_extractor = bc.PersistentExtractorWrapper(entropy_extractor, "entropies")

frequencies_hierarchial = bc.CollectionHierarchialFeatures.from_book_collection(aBookCollection, frequency_extractor)
entropies_hierarchial = bc.CollectionHierarchialFeatures.from_book_collection(aBookCollection, entropy_extractor)
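
The entropies feature is less standard than plain word frequencies, so here is a minimal sketch of the idea, assuming EntropiesExtractor with FixedGrouper(500) measures, for each word, how evenly its occurrences are spread over fixed 500-token windows (an illustration of the concept, not the book_classification implementation; the function name is made up):

# Sketch: per-word entropy over fixed-size token windows. Words spread evenly
# through a book get high entropy; words concentrated in a few passages get
# low entropy, independently of their overall frequency.
from collections import Counter
from math import log

def window_entropies(tokens, window_size=500):
    windows = [tokens[i:i + window_size] for i in range(0, len(tokens), window_size)]
    counts_per_window = [Counter(w) for w in windows]
    totals = Counter()
    for counts in counts_per_window:
        totals.update(counts)
    entropies = {}
    for word, total in totals.items():
        probs = [counts[word] / total for counts in counts_per_window if counts[word] > 0]
        entropies[word] = -sum(p * log(p, 2) for p in probs)
    return entropies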

Visualizing books in feature space

Entropies vs Frequencies (each row shows one group of 10 authors: frequencies on the left, entropies on the right)


In [71]:
all_authors = list(aBookCollection.authors())
random.shuffle(all_authors)
grouper = bc.FixedGrouper(10)

books, authors = aBookCollection.as_arrays()
authors_indexer = bc.NumericIndexer(authors)
authors_matrix = [authors_indexer.encode(a) for a in authors]  # integer class label per book

In [68]:
matrix_extractor = bc.SklExtractor(frequency_extractor)
matrix_extractor.fit(books)
frequencies_matrix = matrix_extractor.transform(books)
frequencies_svd = decomposition.TruncatedSVD(30)
frequencies_svd_matrix = frequencies_svd.fit_transform(frequencies_matrix)

In [69]:
matrix_extractor = bc.SklExtractor(entropy_extractor)
matrix_extractor.fit(books)
entropies_matrix = matrix_extractor.transform(books)
entropies_svd = decomposition.TruncatedSVD(30)
entropies_svd_matrix = entropies_svd.fit_transform(entropies_matrix)
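
SklExtractor presumably adapts the per-book feature dictionaries to the scikit-learn matrix interface. As a rough stand-in (an assumption about its role, not the actual implementation; extract_from is a hypothetical method name), the same step could be written with scikit-learn's DictVectorizer:

# Hypothetical equivalent of the extract -> vectorize -> reduce step:
# each book's word -> value mapping becomes one row of a sparse matrix over a
# shared vocabulary, which TruncatedSVD then projects to 30 dimensions.
from sklearn.feature_extraction import DictVectorizer

feature_dicts = [dict(frequency_extractor.extract_from(book)) for book in books]  # assumed API
vectorizer = DictVectorizer()
raw_matrix = vectorizer.fit_transform(feature_dicts)  # books x vocabulary, sparse
reduced_matrix = decomposition.TruncatedSVD(30).fit_transform(raw_matrix)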

In [72]:
figure, axmatrix = plt.subplots(5, 2, figsize=(10, 15))
for i, some in enumerate(grouper.parts_from(all_authors)):
    indices = [j for j, b in enumerate(books) if b.author() in some]
    colors = [authors_indexer.encode(books[j].author()) for j in indices]
    
    one = manifold.Isomap(n_neighbors=30, n_components=2)
    one_matrix = one.fit_transform(frequencies_svd_matrix[indices, :])
    axmatrix[i, 0].scatter(one_matrix[:, 0], one_matrix[:, 1], c=colors, cmap='Paired', s=50)
    
    two = manifold.Isomap(n_neighbors=30, n_components=2)
    two_matrix = two.fit_transform(entropies_svd_matrix[indices, :])
    axmatrix[i, 1].scatter(two_matrix[:, 0], two_matrix[:, 1], c=colors, cmap='Paired', s=50)


Visualizing authors in feature space

Frequencies


In [5]:
author_frequencies_matrix = frequencies_hierarchial.authors_features()
author_frequencies_svd = decomposition.TruncatedSVD(50)
author_frequencies_svd_matrix = author_frequencies_svd.fit_transform(author_frequencies_matrix)

In [46]:
# _fit is a private scikit-learn method; it returns the raw SVD factors here,
# but it is not part of the public API and may change between versions
U, Sigma, VT = author_frequencies_svd._fit(author_frequencies_matrix)
pandas.Series(Sigma)[:30].plot(kind="bar")


Out[46]:
<matplotlib.axes.AxesSubplot at 0x7f469e23e0d0>
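
Since _fit is private, a version-safe way to get a similar spectrum diagnostic is the explained variance ratio of the already-fitted estimator (a sketch using only public attributes of the fitted TruncatedSVD):

# Share of variance captured by each of the 50 components, via the public API
pandas.Series(author_frequencies_svd.explained_variance_ratio_).plot(kind='bar')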

In [53]:
pandas.Series(sort(U[:, 0])).plot(kind='bar')


Out[53]:
<matplotlib.axes.AxesSubplot at 0x7f469e01ed50>

In [54]:
pandas.Series(sort(U[0, :])).plot(kind='bar')


Out[54]:
<matplotlib.axes.AxesSubplot at 0x7f469d574b10>

In [51]:
pandas.Series(VT[0,:]).hist(bins=50)


Out[51]:
<matplotlib.axes.AxesSubplot at 0x7f469d46f5d0>

In [12]:
scatter(author_frequencies_svd_matrix[:, 0], author_frequencies_svd_matrix[:, 1])


Out[12]:
<matplotlib.collections.PathCollection at 0x7f469e31aa90>

In [32]:
author_freq_isomap = manifold.Isomap(n_neighbors=30, n_components=2)
author_freq_isomap_matrix = author_freq_isomap.fit_transform(author_frequencies_svd_matrix)
sizes = [7*log(len(aBookCollection.books_by(author)))**2 for author in frequencies_hierarchial._by_author.keys()]
colors = range(len(sizes))
plt.figure(figsize=(10, 6))
scatter(author_freq_isomap_matrix[:, 0], author_freq_isomap_matrix[:, 1], s=sizes, c=colors, cmap='Paired')


Out[32]:
<matplotlib.collections.PathCollection at 0x7f469d52f450>

Entropies


In [8]:
author_entropies_matrix = entropies_hierarchial.authors_features()
author_entropies_svd = decomposition.TruncatedSVD(50)
author_entropies_svd_matrix = author_entropies_svd.fit_transform(author_entropies_matrix)

In [55]:
# same private-API caveat as above: _fit is not part of the public interface
U, Sigma, VT = author_entropies_svd._fit(author_entropies_matrix)
pandas.Series(Sigma)[:30].plot(kind="bar")


Out[55]:
<matplotlib.axes.AxesSubplot at 0x7f469d819450>

In [44]:
pandas.Series(U[:,0]).plot(kind='kde')


Out[44]:
<matplotlib.axes.AxesSubplot at 0x7f469db4af10>

In [58]:
pandas.Series(sort(U[0, :])).plot(kind='bar')


Out[58]:
<matplotlib.axes.AxesSubplot at 0x7f469e48f650>

In [45]:
pandas.Series(VT[0,:]).hist(bins=20)


Out[45]:
<matplotlib.axes.AxesSubplot at 0x7f469d64ecd0>

In [35]:
scatter(author_entropies_svd_matrix[:, 0], author_entropies_svd_matrix[:, 1])


Out[35]:
<matplotlib.collections.PathCollection at 0x7fa14149ac10>

In [30]:
author_ent_isomap = manifold.Isomap(n_neighbors=30, n_components=2)
author_ent_isomap_matrix = author_ent_isomap.fit_transform(author_entropies_svd_matrix)
sizes = [7*log(len(aBookCollection.books_by(author)))**2 for author in entropies_hierarchial._by_author.keys()]
colors = range(len(sizes))
plt.figure(figsize=(10, 6))
scatter(author_ent_isomap_matrix[:, 0], author_ent_isomap_matrix[:, 1], s=sizes, c=colors, cmap='Paired')


Out[30]:
<matplotlib.collections.PathCollection at 0x7f469e330fd0>

Variation of accuracy

Frequencies


In [4]:
#extractor = bc.EntropiesExtractor(tokenizer, bc.FixedGrouper(500))
#extractor = bc.CachedExtractorWrapper(frequency_extractor)
#classification_model = bc.ClassificationModel(frequency_extractor, decomposition.TruncatedSVD(30), svm.SVC())
classification_model = bc.ClassificationModelFixedVoc(frequency_extractor,
    decomposition.TruncatedSVD(30), svm.SVC(), frequencies_hierarchial.total().keys())
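
ClassificationModelFixedVoc comes from book_classification; as an assumption about its structure (feature extraction with a fixed vocabulary, then dimensionality reduction, then a classifier), the reduction-plus-classifier part corresponds to a plain scikit-learn Pipeline over an already-vectorized matrix:

# Sketch of the SVD -> SVC stack as a scikit-learn Pipeline; the real model
# additionally handles raw Book objects and pins the vocabulary to
# frequencies_hierarchial.total().keys().
skl_model = pipeline.Pipeline([
    ('svd', decomposition.TruncatedSVD(30)),
    ('svc', svm.SVC()),
])
# Hypothetical usage on a precomputed matrix and integer labels:
# skl_model.fit(frequencies_matrix_train, labels_train)
# predictions = skl_model.predict(frequencies_matrix_test)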

In [5]:
experiment_series = bc.ESOverAuthorsCount(aBookCollection, classification_model)
config = {'num_books': 8, 'num_trials': 6, 'training_percentage': 0.7, 'num_authors': 30}
results = experiment_series.run_experiment(config)
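
run_experiment hides the sampling and scoring; roughly, for each author count it should repeat num_trials random trials like the sketch below (an assumption about ESOverAuthorsCount; one_trial and the fit/predict calls on the classification model are illustrative, not the library's API):

# One hypothetical trial: pick num_authors authors, sample num_books books from
# each, split them by training_percentage, then fit and score the model.
def one_trial(collection, model, num_authors, num_books, training_percentage):
    chosen = random.sample(list(collection.authors()), num_authors)
    train_books, test_books = [], []
    for author in chosen:
        sample = random.sample(list(collection.books_by(author)), num_books)
        cut = int(round(training_percentage * num_books))
        train_books += sample[:cut]
        test_books += sample[cut:]
    model.fit(train_books)                   # assumed ClassificationModel API
    predictions = model.predict(test_books)  # assumed ClassificationModel API
    truth = [book.author() for book in test_books]
    return metrics.accuracy_score(truth, predictions)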

In [27]:
boxplot(results)
xticks(range(1, len(results)+1), range(2, len(results)+2))
hold(True)
plot(range(1, len(results)+1), [1/x for x in range(2, len(results)+2)], c='green')
plt.ylim([0,1])
None



In [7]:
experiment_series2 = bc.ESOverBiasedAuthorsCount(aBookCollection, classification_model)
config2 = {'min_books': 8, 'num_trials': 6, 'training_percentage': 0.7, 'num_authors': 30}
results2 = experiment_series2.run_experiment(config2)

In [28]:
boxplot(results2)
xticks(range(1, len(results2)+1), range(2, len(results2)+2))
hold(True)
plot(range(1, len(results2)+1), [sum(accs)/len(accs) for accs in experiment_series2.min_acc()], c='green')
plt.ylim([0,1])
None



In [9]:
another_experiment_series2 = bc.ESOverTrainingProportion(aBookCollection, classification_model)
config = {'num_books': 10, 'num_trials': 6, 'num_steps': 10, 'num_authors': 10}
another_results2 = another_experiment_series2.run_experiment(config)

In [26]:
boxplot(another_results2)
xticks(range(1, 10), [x/10 for x in range(1, 10)])
hold(True)
plot(range(1, len(another_results2)+1), [1/10]*len(another_results2), c='green')
plt.ylim([0,1])
None



In [11]:
third_experiment_series = bc.ESOverBiasedTrainingProportion(aBookCollection, classification_model)
config = {'min_books': 8, 'num_trials': 6, 'num_steps': 10, 'num_authors': 10}
third_results = third_experiment_series.run_experiment(config)

In [25]:
boxplot(third_results)
xticks(range(1, 10), [x/10 for x in range(1, 10)])
hold(True)
plot(range(1, len(third_results)+1), [sum(accs)/len(accs) for accs in third_experiment_series.min_acc()], c='green')
plt.ylim([0,1])
None


Entropies


In [13]:
#extractor = bc.EntropiesExtractor(tokenizer, bc.FixedGrouper(500))
#extractor = bc.CachedExtractorWrapper(frequency_extractor)
#classification_model = bc.ClassificationModel(entropy_extractor, decomposition.TruncatedSVD(30), svm.SVC())
classification_model_ent = bc.ClassificationModelFixedVoc(entropy_extractor,
    decomposition.TruncatedSVD(30), svm.SVC(), entropies_hierarchial.total().keys())

In [14]:
experiment_series_ent = bc.ESOverAuthorsCount(aBookCollection, classification_model_ent)
config = {'num_books': 8, 'num_trials': 6, 'training_percentage': 0.7, 'num_authors': 30}
results_ent = experiment_series_ent.run_experiment(config)

In [29]:
boxplot(results_ent)
xticks(range(1, len(results_ent)+1), range(2, len(results_ent)+2))
hold(True)
plot(range(1, len(results_ent)+1), [1/x for x in range(2, len(results_ent)+2)], c='green')
plt.ylim([0,1])
None



In [16]:
experiment_series_ent2 = bc.ESOverBiasedAuthorsCount(aBookCollection, classification_model_ent)
config = {'min_books': 8, 'num_trials': 6, 'training_percentage': 0.7, 'num_authors': 30}
results_ent2 = experiment_series_ent2.run_experiment(config)

In [30]:
boxplot(results_ent2)
xticks(range(1, len(results_ent2)+1), range(2, len(results_ent2)+2))
hold(True)
plot(range(1, len(results_ent2)+1), [sum(accs)/len(accs) for accs in experiment_series_ent2.min_acc()], c='green')
plt.ylim([0,1])
None



In [18]:
another_experiment_series = bc.ESOverTrainingProportion(aBookCollection, classification_model_ent)
config = {'num_books': 10, 'num_trials': 6, 'num_steps': 10, 'num_authors': 10}
another_results = another_experiment_series.run_experiment(config)

In [31]:
boxplot(another_results)
xticks(range(1, 10), [x/10 for x in range(1, 10)])
hold(True)
plot(range(1, len(another_results)+1), [1/10]*len(another_results), c='green')
plt.ylim([0,1])
None



In [20]:
another_experiment_series3 = bc.ESOverBiasedTrainingProportion(aBookCollection, classification_model_ent)
config = {'min_books': 8, 'num_trials': 6, 'num_steps': 10, 'num_authors': 10}
another_results3 = another_experiment_series3.run_experiment(config)

In [32]:
boxplot(another_results3)
xticks(range(1, 10), [x/10 for x in range(1, 10)])
hold(True)
plot(range(1, len(another_results3)+1), [sum(accs)/len(accs) for accs in another_experiment_series3.min_acc()], c='green')
plt.ylim([0,1])
None


